# -*- coding: utf-8 -*-
"""PCA.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/10EFDEuxsqNMhsDxsyd3zUeMeUifFrWnm
"""

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Build a reproducible two-feature toy dataset. The generator is seeded so the
# printed rows and the plots are the same on every run.
np.random.seed(42)
X = np.random.randn(100, 2)
# Overwrite the second column with a noisy linear function of the first one,
# which makes the two features strongly correlated.
noise = np.random.randn(100)
X[:, 1] = 2 * X[:, 0] + 0.5 * noise

print("Original Data Shape:", X.shape)
print("First 5 rows of original data:\n", X[:5])

# Scatter plot of the raw (unscaled) features.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X[:, 0], X[:, 1], alpha=0.7)
ax.set_title('Original 2D Data')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.grid(True)
plt.show()

# Standardize the features; the fitted scaler is kept in `scaler`.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("\nScaled Data Shape:", X_scaled.shape)
print("First 5 rows of scaled data:\n", X_scaled[:5])

# Scatter plot of the standardized features. The cloud usually keeps a similar
# shape to the original one; what changes is the scale of the axes.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_scaled[:, 0], X_scaled[:, 1], alpha=0.7)
ax.set(
    title='Scaled 2D Data',
    xlabel='Scaled Feature 1',
    ylabel='Scaled Feature 2',
)
ax.grid(True)
plt.show()

# Reduce the standardized data to a single principal component.
# `n_components` may be an integer (how many components to keep) or a float
# between 0 and 1 (the fraction of variance that must be explained).
pca = PCA(n_components=1)
X_pca = pca.fit_transform(X_scaled)

print("\nTransformed Data Shape (PCA to 1D):", X_pca.shape)
print("First 5 rows of PCA transformed data:\n", X_pca[:5])

# Share of the total variance captured by each kept component, and their sum.
variance_ratios = pca.explained_variance_ratio_
print("\nExplained variance ratio by each component:", variance_ratios)
print("Total explained variance:", np.sum(variance_ratios))

# The principal axes (eigenvectors). `pca.components_` has shape
# (n_components, n_features_original), i.e. (1, 2) here; row 0 is the first
# principal component.
print("\nPrincipal Components (eigenvectors):\n", pca.components_)

# The projection is one-dimensional, so draw every point on the line y = 0.
projected = X_pca[:, 0]
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(projected, np.zeros_like(projected), alpha=0.7)
ax.set_title('PCA Transformed Data (1D)')
ax.set_xlabel('Principal Component 1')
ax.set_yticks([])  # the y position carries no information
ax.grid(True)
plt.show()

# Overlay the direction of the first principal component on the scaled data.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_scaled[:, 0], X_scaled[:, 1], alpha=0.7, label='Scaled Data')

# The component is a unit vector, so it is stretched by `scale_factor` to make
# it visible. The segment is drawn symmetrically around the origin, which is
# the mean of the centered data.
component = pca.components_[0]
scale_factor = 2  # purely cosmetic; adjust for a longer or shorter segment
tip_x, tip_y = component * scale_factor
ax.plot([-tip_x, tip_x], [-tip_y, tip_y], 'r--', label='Principal Component 1')

ax.set_title('Scaled Data with Principal Component')
ax.set_xlabel('Scaled Feature 1')
ax.set_ylabel('Scaled Feature 2')
ax.grid(True)

# Thin grey guide lines marking the two coordinate axes.
guide_style = dict(color='grey', linestyle='--', linewidth=0.8)
ax.axhline(0, **guide_style)
ax.axvline(0, **guide_style)

ax.legend()
plt.show()

# Switch to a dataset with more than two features: the Iris dataset.
from sklearn.datasets import load_iris

iris = load_iris()
# Feature matrix and class labels, plus the column names for reporting.
X_iris, y_iris = iris.data, iris.target
feature_names = iris.feature_names

print("\nIris Data Shape:", X_iris.shape)
print("Iris Feature Names:", feature_names)

# Standardize the Iris features before running PCA on them.
scaler_iris = StandardScaler()
X_iris_scaled = scaler_iris.fit_transform(X_iris)

# Fit PCA without discarding anything (n_components=None keeps all
# components) so the variance explained by every component can be inspected.
pca_full = PCA(n_components=None)
pca_full.fit(X_iris_scaled)

explained_variance_ratio = pca_full.explained_variance_ratio_
print("\nExplained variance ratio for Iris data (all components):\n", explained_variance_ratio)

# Plot the explained variance ratio of each individual component.
# The x-axis is the 1-based index of the component, not a count of components,
# so it is labelled 'Principal Component'.
component_indices = range(1, len(explained_variance_ratio) + 1)
plt.figure(figsize=(10, 6))
plt.plot(component_indices, explained_variance_ratio, marker='o', linestyle='--')
plt.title('Explained Variance Ratio by Principal Component')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.xticks(component_indices)  # one tick per component
plt.grid(True)
plt.show()

# Running total of the per-component ratios: entry k-1 is the fraction of
# variance explained by the first k components taken together.
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

# Plot the cumulative curve against the number of components kept.
component_counts = np.arange(1, cumulative_explained_variance.size + 1)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(component_counts, cumulative_explained_variance, marker='o', linestyle='--')
ax.set(
    title='Cumulative Explained Variance by Principal Component',
    xlabel='Number of Principal Components',
    ylabel='Cumulative Explained Variance',
)
ax.set_xticks(component_counts)  # one tick per component count
ax.grid(True)
plt.show()

# The cumulative curve tells us how many components are worth keeping.
# Here we take the smallest count whose cumulative explained variance is at
# least 95%: find the first index that reaches the target and convert the
# 0-based index into a count.
reaches_target = cumulative_explained_variance >= 0.95
n_components_95 = np.flatnonzero(reaches_target)[0] + 1
print(f"\nNumber of components to retain 95% variance: {n_components_95}")

# Refit PCA on the standardized Iris data, keeping only that many components.
pca_final = PCA(n_components=n_components_95)
X_iris_pca = pca_final.fit_transform(X_iris_scaled)

print("\nIris Data transformed with PCA (chosen components) shape:", X_iris_pca.shape)
print("First 5 rows of Iris PCA transformed data:\n", X_iris_pca[:5])

# Plot the first two principal components of the Iris data, coloured by class
# label. This is only possible when at least two components were kept.
if n_components_95 >= 2:
    pc1, pc2 = X_iris_pca[:, 0], X_iris_pca[:, 1]
    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(pc1, pc2, c=y_iris, cmap='viridis', alpha=0.7)
    ax.set_title('Iris Data after PCA (2 Components)')
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    # One colorbar tick per class label (0, 1, 2).
    fig.colorbar(scatter, ax=ax, ticks=[0, 1, 2], label='Species')
    ax.grid(True)
    plt.show()